Twitter Bot: NLP Emotion Classifier

Deep Learning

Building and deploying an emotion-classifying Twitter bot that responds to users who prompt it with a hashtag (#) of interest. The bot uses a pretrained BERT encoder fine-tuned on a tweet emotion dataset.


Jake Gehri


October 7, 2022

! huggingface-cli login
! pip install datasets
from datasets import list_datasets
import tensorflow as tf
from transformers import pipeline, PushToHubCallback
all_datasets = list_datasets()
['acronym_identification', 'ade_corpus_v2', 'adversarial_qa', 'aeslc', 'afrikaans_ner_corpus']
from datasets import load_dataset
emotions = load_dataset('emotion')
train_ds = emotions['train']
    features: ['text', 'label'],
    num_rows: 16000
{'text': 'i didnt feel humiliated', 'label': 0}
import pandas as pd
emotions.set_format(type = 'pandas')
df = emotions['train'][:]
text label
0 i didnt feel humiliated 0
1 i can go from feeling so hopeless to so damned... 0
2 im grabbing a minute to post i feel greedy wrong 3
3 i am ever feeling nostalgic about the fireplac... 2
4 i am feeling grouchy 3
... ... ...
15995 i just had a very brief time in the beanbag an... 0
15996 i am now turning and i feel pathetic that i am... 0
15997 i feel strong and good overall 1
15998 i feel like this was such a rude comment and i... 3
15999 i know a lot but i feel so stupid because i ca... 0

16000 rows × 2 columns

def label_int2str(row):
    """Return the string name of an integer emotion label.

    The lookup goes through the ``label`` feature of the ``emotions``
    training split, so ``row`` is expected to be a single integer class id
    (it is applied element-wise to the ``label`` column).
    """
    label_feature = emotions['train'].features['label']
    return label_feature.int2str(row)
df['label_name'] = df['label'].apply(label_int2str)
text label label_name
0 i didnt feel humiliated 0 sadness
1 i can go from feeling so hopeless to so damned... 0 sadness
2 im grabbing a minute to post i feel greedy wrong 3 anger
3 i am ever feeling nostalgic about the fireplac... 2 love
4 i am feeling grouchy 3 anger
import matplotlib.pyplot as plt

plt.title("Frequency of Classes")

df['words_per_tweet'] = df['text'].str.split().apply(len)
df.boxplot('words_per_tweet', by='label_name', grid=False, showfliers=False)
! pip install transformers
from transformers import AutoTokenizer

model_checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def tokenize(batch):
    """Run the module-level tokenizer over the ``text`` field of ``batch``.

    Padding and truncation are both enabled, so every sequence in the
    returned encoding has the same length within the batch.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded
{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
# `emotions` was switched to pandas output format earlier for the exploratory
# analysis; restore the default format so `tokenize` receives plain lists of
# strings rather than pandas objects.
emotions.reset_format()

# NOTE(review): the head of this call was lost in the notebook export and has
# been reconstructed as `emotions.map(tokenize, ...)` — confirm against the
# original notebook.
# batch_size=None hands each split to `tokenize` as a single batch, so every
# example in a split is padded to one common length.
emotions_encoded = emotions.map(tokenize, batched = True, batch_size = None)
from transformers import TFAutoModelForSequenceClassification

num_labels = 6

tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification at 0x7f50d851c510>
from sklearn.metrics import accuracy_score, f1_score
# Input names reported by the tokenizer; these are the feature columns fed to the model.
tokenizer_columns = tokenizer.model_input_names
batch_size = 64

# The train and validation splits are converted with identical settings,
# so the arguments are written once and shared.
tf_dataset_kwargs = dict(
    columns=tokenizer_columns,
    label_cols=['label'],
    shuffle=True,
    batch_size=batch_size,
)

tf_train_dataset = emotions_encoded['train'].to_tf_dataset(**tf_dataset_kwargs)

tf_validation_dataset = emotions_encoded['validation'].to_tf_dataset(**tf_dataset_kwargs)
# NOTE(review): this cell was truncated in the notebook export. The pieces
# marked "reconstructed" below are best guesses and must be confirmed against
# the original notebook before relying on them.

# Push checkpoints from "model_output/" to the Hugging Face Hub during training.
# reconstructed: `tokenizer=` and `hub_model_id=` (the repo name is taken from
# the model id loaded later for inference).
callbacks = [PushToHubCallback("model_output/",
                               tokenizer=tokenizer,
                               hub_model_id="twitter-emotion-classifier-BERT")]

# The model outputs raw logits, hence from_logits=True on the loss.
# reconstructed: the optimizer and its learning rate.
tf_model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5),
                 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 metrics = tf.metrics.SparseCategoricalAccuracy())

# reconstructed: `tf_model.fit(tf_train_dataset` — the remaining arguments are original.
tf_model.fit(tf_train_dataset, validation_data = tf_validation_dataset, epochs = 2, callbacks=callbacks)
# Build a text-classification pipeline around the published fine-tuned model.
classifier = pipeline("text-classification", model = "jakegehri/twitter-emotion-classifier-BERT")

# Emotion names, indexed by integer class id.
labels = emotions['train'].features['label'].names

test_tweet = "what is going on"
# top_k=6 asks for a score for six classes instead of only the best one.
preds = classifier(test_tweet, top_k=6)

# Predictions are tagged "LABEL_<n>"; extract <n> from the first prediction.
first_label = preds[0]['label']
emotion_int = int(first_label.replace("_"," ").split()[1])
[{'label': 'LABEL_3', 'score': 0.6134325861930847},
 {'label': 'LABEL_4', 'score': 0.3628736138343811},
 {'label': 'LABEL_1', 'score': 0.01299766730517149},
 {'label': 'LABEL_0', 'score': 0.008490157313644886},
 {'label': 'LABEL_5', 'score': 0.0016536037437617779},
 {'label': 'LABEL_2', 'score': 0.0005523563013412058}]
# Integer class id of every prediction, in the order the pipeline returned
# them. Each prediction is tagged "LABEL_<n>", and <n> is the class id.
rank = [int(pred['label'].replace("_"," ").split()[1]) for pred in preds]
# Emotion name for each predicted class id, in the same order as `preds`.
# NOTE(review): the loop body was lost in the notebook export; indexing
# `labels` by class id reproduces the output shown directly below.
re_rank = [labels[i] for i in rank]
['anger', 'fear', 'joy', 'sadness', 'surprise', 'love']
preds_df = pd.DataFrame(preds)
# NOTE(review): the head of the plotting call was lost in the notebook export.
# Reconstructed as a bar chart of the scores (as percentages) against the
# emotion names in prediction order — confirm against the original notebook.
plt.bar(re_rank, 100 * preds_df['score'], color = 'C0')